KPMG’s Analytics, Information & Modelling group helps organisations take the mystery out of big data and show them how to leverage their data resources to produce better business outcomes.
Data Insights
Data is from Sprocket Central Pty Ltd, a medium-sized bikes & cycling accessories organisation.
Using the existing 3 datasets (Customer demographic, customer address and transactions) as a labelled dataset, please recommend which of these 1000 new customers should be targeted to drive the most value for the organisation.
import numpy as np
import pandas as pd
pd.set_option('max.columns',50)
import datetime
import plotly.graph_objects as go
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
We cleaned the data in the previous module.
# Load the three pre-cleaned sheets produced by the previous (data-cleaning) module.
transaction = pd.read_excel('clean_kpmg_data.xlsx', sheet_name='Transactions')
cus_list = pd.read_excel('clean_kpmg_data.xlsx', sheet_name='NewCustomerList')
# NOTE(review): sheet name 'CustomerDeomgraphic' looks misspelled — presumably
# it matches the actual sheet name in the workbook; verify before "fixing" it.
cus_dem = pd.read_excel('clean_kpmg_data.xlsx', sheet_name='CustomerDeomgraphic')
We shall combine all the given data into a single dataset for the final analysis.
# Drop identifier/product columns not needed for customer-level analysis.
transaction = transaction.drop(
    ['transaction_id', 'product_id', 'Unnamed: 0', 'product_line', 'product_size'],
    axis=1)
# Profit per transaction = sale price minus cost.
transaction['profit'] = transaction['list_price'] - transaction['standard_cost']
transaction = transaction.drop(['list_price', 'standard_cost'], axis=1)
# Encode order status numerically (1 = Approved, 0 = Cancelled) so a later
# groupby().sum() yields the number of approved orders per customer.
transaction['order_status'] = transaction['order_status'].map({'Approved': 1, 'Cancelled': 0})
# 'product_first_sold_date' is an Excel serial day number.  Excel's day 0 is
# 1899-12-30, not 1900-01-01 (day 1 is 1900-01-01 and Excel wrongly treats
# 1900 as a leap year), so using datetime(1900, 1, 1) as the epoch shifted
# every date by two days.  pd.to_datetime supports this origin directly.
transaction['product_first_sold_date'] = pd.to_datetime(
    transaction['product_first_sold_date'], unit='D', origin='1899-12-30')
# Recency: days since each transaction, measured from the newest transaction.
most_recent_purchase = transaction['transaction_date'].max()
transaction['last_purchase_days_ago'] = (
    (most_recent_purchase - transaction['transaction_date']) / np.timedelta64(1, 'D'))
# Creating column for each brand this will be usefull for finding out which brand the customer has brought frequently
# One 0/1 indicator column per brand, so that a later groupby().sum() gives,
# per customer, how many purchases were made of each brand.
brand_columns = {
    'WeareA2B': 'WeareA2B',
    'Norco_Bicycles': 'Norco Bicycles',
    'Giant_Bicycles': 'Giant Bicycles',
    'Trek_Bicycles': 'Trek Bicycles',
    'Solex': 'Solex',
    'OHM_Cycles': 'OHM Cycles',
}
for new_col, brand_name in brand_columns.items():
    transaction[new_col] = (transaction['brand'] == brand_name).astype(int)
# The raw brand column is fully captured by the indicators above.
transaction = transaction.drop('brand', axis=1)
# To calculate how many orders the customer has ordered(frequency) and total profit from the customer.
# Aggregate per customer: frequency (count of approved orders), total profit,
# and per-brand purchase counts.  numeric_only=True keeps the aggregation from
# raising on the datetime columns still present in `transaction`.
tran_freq = transaction.groupby('customer_id').sum(numeric_only=True)
# The *sum* of recency is meaningless; replace it with the customer's most
# recent purchase, i.e. the minimum of last_purchase_days_ago.
tran_freq = tran_freq.drop('last_purchase_days_ago', axis=1)
tran_freq['last_purchase_days_ago'] = (
    transaction.groupby('customer_id')['last_purchase_days_ago'].min())
# order_status summed 1s-for-Approved, so it is the purchase frequency.
tran_freq.rename(columns={'order_status': 'frequency'}, inplace=True)
# Keep only the demographic columns useful for modelling, then attach the
# per-customer transaction aggregates.
unused_cols = ['Unnamed: 0', 'first_name', 'last_name', 'DOB',
               'job_title', 'deceased_indicator']
cus_dem = cus_dem.drop(unused_cols, axis=1)
# Right-merge: keep exactly the customers that have transaction aggregates.
final_data = cus_dem.merge(tran_freq, how='right', on='customer_id')
# Bucket ages into decades; each bucket is labelled by its upper bound.
age_bins = [0, 20, 30, 40, 50, 60, 70, 200]
age_labels = [20, 30, 40, 50, 60, 70, 80]
final_data['age_category'] = pd.cut(final_data['age'], bins=age_bins, labels=age_labels)
# Discard rows with unknown gender, then any rows with missing values.
final_data = final_data[final_data['gender'] != 'Unknown']
final_data.dropna(inplace=True)
final_data.head()
| customer_id | gender | bike_purchase | job_industry | wealth_segment | owns_car | tenure | address | postcode | state | country | property_valuation | age | online_order | frequency | profit | WeareA2B | Norco_Bicycles | Giant_Bicycles | Trek_Bicycles | Solex | OHM_Cycles | last_purchase_days_ago | age_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 93.0 | Health | Mass Customer | Yes | 11.0 | 060 Morning Avenue | 2016.0 | NSW | Australia | 10.0 | 68.0 | 6 | 11 | 3018.09 | 0 | 2 | 4 | 1 | 2 | 2 | 7.0 | 70 |
| 1 | 2 | Male | 81.0 | Financial Services | Mass Customer | Yes | 16.0 | 6 Meadow Vale Court | 2153.0 | NSW | Australia | 10.0 | 41.0 | 1 | 3 | 2226.26 | 2 | 0 | 1 | 0 | 0 | 0 | 128.0 | 50 |
| 3 | 4 | Male | 33.0 | IT | Mass Customer | No | 7.0 | 0 Holy Cross Court | 4211.0 | QLD | Australia | 9.0 | 60.0 | 1 | 2 | 220.57 | 0 | 0 | 1 | 0 | 1 | 0 | 195.0 | 60 |
| 4 | 5 | Female | 56.0 | Others | Affluent Customer | Yes | 8.0 | 17979 Del Mar Point | 2448.0 | NSW | Australia | 4.0 | 44.0 | 2 | 6 | 2394.94 | 1 | 2 | 1 | 1 | 1 | 0 | 16.0 | 50 |
| 5 | 6 | Male | 35.0 | Retail | High Net Worth | Yes | 13.0 | 9 Oakridge Court | 3216.0 | VIC | Australia | 9.0 | 55.0 | 2 | 5 | 3946.55 | 2 | 0 | 2 | 1 | 0 | 0 | 64.0 | 60 |
# Report how many rows survived the cleaning/merging steps above
# (roughly 670 customer IDs were dropped for inconsistent data).
print(f'We removed roughly 670 IDs due to inconsistent data, so the final length of the data is {len(final_data)}')
We roughly removed along 670 Ids due to inconsistent data, So our final length of the data is 3325
# A recency of exactly 0 days would make the score computed below divide by
# zero, so clamp zeros to a small positive value instead.
final_data['last_purchase_days_ago'] = final_data['last_purchase_days_ago'].mask(
    final_data['last_purchase_days_ago'] == 0, 0.1)
from sklearn.cluster import KMeans

k = 5
# RFM-style score: profit weighted by purchase frequency, discounted by
# recency (more recent purchasers score higher).
final_data['score'] = (final_data['profit'] * final_data['frequency'] / 100) / final_data['last_purchase_days_ago']
# Pin the seed and the number of restarts so the cluster labels are
# reproducible — the hard-coded label -> rank-name mapping applied later
# depends on the clusters coming out in a stable order.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
Clusters = kmeans.fit_predict(final_data.loc[:, ['profit', 'score']])
final_data['Customer_Rank'] = Clusters
# Lets check out how many classified into the clusters!!
final_data['Customer_Rank'].value_counts()
4 1059 0 831 1 704 2 563 3 168 Name: Customer_Rank, dtype: int64
# Map the raw KMeans cluster indices to business-friendly rank names.
# NOTE(review): this index -> name mapping was chosen by inspecting one
# particular KMeans run; cluster indices are arbitrary and can change between
# runs/seeds, so confirm the mapping still matches the fitted clusters.
final_data['Customer_Rank'] = final_data['Customer_Rank'].map({3:'Royal',2:'Loyal',0:'Friendly',4:'Unsure',1:'Looking-to-Switch'})
# For each categorical feature, draw a pair of plots side by side:
# left  -> customer counts per category, split by Customer_Rank;
# right -> profit distribution per category (strip plot).
col = ['gender', 'state', 'wealth_segment', 'job_industry', 'owns_car', 'age_category']
sns.color_palette("Set2")
for feature in col:
    fig, axes = plt.subplots(1, 2, figsize=(20, 5))
    # Plot 1: counts, coloured by customer rank.
    sns.countplot(x=final_data[feature], hue=final_data['Customer_Rank'],
                  ax=axes[0], palette='plasma')
    # Plot 2: profit per category value.
    sns.stripplot(x=final_data[feature], y='profit', data=final_data,
                  ax=axes[1], palette='viridis')
    fig.suptitle(feature, fontsize=15, y=1.1)
    plt.xticks(rotation=25)
    plt.tight_layout(pad=2.0)
    plt.show()
# Age distribution of the final customer set, split by gender.
plt.figure(figsize=(20, 5))
sns.countplot(data=final_data, x='age', hue='gender', palette='plasma')
plt.xticks(rotation=30)
plt.title('Age Distribution')
plt.show()
# Job industry vs gender: customer counts per industry, split by gender.
plt.figure(figsize=(20, 10))
sns.countplot(x=final_data['job_industry'], hue=final_data['gender'], palette='winter')
<AxesSubplot:xlabel='job_industry', ylabel='count'>
# New South Wales vs gender: count NSW customers by gender.
# Filter the whole frame first so x and hue come from the same (NSW-only)
# subset — the original mixed a filtered x vector with the unfiltered
# data= frame, which only worked because seaborn aligns vectors on the index.
nsw_customers = final_data[final_data['state'] == 'NSW']
sns.countplot(x='state', hue='gender', data=nsw_customers, palette='winter')
<AxesSubplot:xlabel='state', ylabel='count'>
There is an equal distribution of men and women across industries and in New South Wales.
# Frequency vs profit: hex-binned joint distribution.
sns.jointplot(x='profit', y='frequency', data=final_data,
              kind='hex', color='red', height=8)
<seaborn.axisgrid.JointGrid at 0x16d50e9e4f0>
The greater the purchase frequency, the higher the profit.
Most purchase frequencies fall around 4–6.
# Customer rank vs brands bought: per-rank totals of the brand indicator
# columns.  Select the brand columns *before* summing so the aggregation
# never touches non-numeric columns (which newer pandas refuses to sum).
brand_cols = ['WeareA2B', 'Norco_Bicycles', 'Giant_Bicycles',
              'Trek_Bicycles', 'Solex', 'OHM_Cycles']
a = final_data.groupby('Customer_Rank')[brand_cols].sum()
a
| WeareA2B | Norco_Bicycles | Giant_Bicycles | Trek_Bicycles | Solex | OHM_Cycles | |
|---|---|---|---|---|---|---|
| Customer_Rank | ||||||
| Friendly | 935 | 731 | 892 | 776 | 1117 | 782 |
| Looking-to-Switch | 289 | 442 | 421 | 355 | 499 | 397 |
| Loyal | 777 | 548 | 691 | 672 | 968 | 619 |
| Royal | 296 | 177 | 243 | 287 | 318 | 231 |
| Unsure | 829 | 849 | 908 | 761 | 1174 | 864 |
import plotly.offline as pyo
import plotly.graph_objs as go

# Render plotly output inline in the notebook (offline mode).
pyo.init_notebook_mode()

# Grouped bar chart: one bar series per brand, grouped by Customer_Rank.
# (brand column, bar colour) pairs, in the original trace order.
brand_colors = [
    ('WeareA2B', 'mediumorchid'),
    ('Norco_Bicycles', 'mediumseagreen'),
    ('Giant_Bicycles', 'mediumslateblue'),
    ('Trek_Bicycles', 'tomato'),
    ('OHM_Cycles', 'mediumturquoise'),
    ('Solex', 'mediumvioletred'),
]
fig = go.Figure()
for brand, color in brand_colors:
    fig.add_trace(go.Bar(
        x=a.index,
        y=a[brand],
        name=brand,
        marker_color=color,
    ))
# Group the bars side by side per rank.
fig.update_layout(barmode='group')
fig.show()
Across all customer ranks, Solex and WeareA2B are the most-purchased brands.